# Global knitr chunk options: cache results, suppress warnings/messages in output.
knitr::opts_chunk$set(cache = TRUE, warning = FALSE, message = FALSE)
# Load the tidyverse meta-package (ggplot2, dplyr, tidyr, readr, forcats, ...).
# The ## lines below are the recorded console output from attaching it.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.5 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# TidyTuesday 2020-02-04: NFL stadium attendance.
# Hoist the shared base URL so the dataset location is defined exactly once
# (the two original calls duplicated the full prefix).
tt_base_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04"
attendance <- read_csv(file.path(tt_base_url, "attendance.csv"))
standings <- read_csv(file.path(tt_base_url, "standings.csv"))
# Attach each team-season's standings (wins, margin of victory, playoff
# status, ...) to its weekly attendance rows. The three keys jointly
# identify a team within a season.
attendance_joined <- left_join(
  attendance,
  standings,
  by = c("year", "team_name", "team")
)
attendance_joined
# Weekly attendance by team, with teams ordered by their typical attendance
# and colored by whether the team made the playoffs that season.
attendance_joined %>%
  filter(!is.na(weekly_attendance)) %>%
  ggplot(
    aes(
      x = fct_reorder(team_name, weekly_attendance),
      y = weekly_attendance,
      fill = playoffs
    )
  ) +
  geom_boxplot(outlier.alpha = 0.5) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Weekly NFL game attendance",
    fill = NULL
  )
# One row per team-season: overlaid histograms of margin of victory for
# playoff vs non-playoff teams.
attendance_joined %>%
  distinct(team_name, year, margin_of_victory, playoffs) %>%
  ggplot(aes(x = margin_of_victory, fill = playoffs)) +
  geom_histogram(alpha = 0.7, position = "identity") +
  labs(
    x = "Margin of victory",
    y = "Number of teams"
  )
# Does attendance drift over the weeks of the season?
attendance_joined %>%
  mutate(week = factor(week)) %>%
  ggplot(aes(x = week, y = weekly_attendance, fill = week)) +
  geom_boxplot(outlier.alpha = 0.5, show.legend = FALSE) +
  labs(
    x = "Week of NFL season",
    y = "Weekly NFL game attendance"
  )
# Modeling data: drop rows with no recorded attendance and keep the
# outcome (weekly_attendance) plus the candidate predictors.
attendance_df <- attendance_joined %>%
  filter(!is.na(weekly_attendance)) %>%
  select(
    weekly_attendance,
    team_name, year, week,
    margin_of_victory, strength_of_schedule, playoffs
  )
attendance_df
library(tidymodels)

# Reproducible train/test split (default proportion, presumably 3:1),
# stratified on playoff status so both sets have similar class balance.
set.seed(1234)
attendance_split <- initial_split(attendance_df, strata = playoffs)
nfl_train <- training(attendance_split)
nfl_test <- testing(attendance_split)
# Model specification: ordinary least squares via the "lm" engine.
lm_spec <- linear_reg() %>%
  set_engine("lm")
lm_spec
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
# Fit the linear model: weekly attendance as a function of every other
# column in the training data. (The ## lines are the recorded output.)
lm_fit <- fit(lm_spec, weekly_attendance ~ ., data = nfl_train)
lm_fit
## parsnip model object
##
## Fit time: 64ms
##
## Call:
## stats::lm(formula = weekly_attendance ~ ., data = data)
##
## Coefficients:
## (Intercept) team_nameBears team_nameBengals
## -81107.86 -2879.80 -4875.47
## team_nameBills team_nameBroncos team_nameBrowns
## -361.08 2805.24 -324.11
## team_nameBuccaneers team_nameCardinals team_nameChargers
## -3063.65 -6139.80 -6489.31
## team_nameChiefs team_nameColts team_nameCowboys
## 1974.33 -3392.79 6068.70
## team_nameDolphins team_nameEagles team_nameFalcons
## 139.68 1259.16 -204.17
## team_nameGiants team_nameJaguars team_nameJets
## 5447.10 -3095.46 4044.23
## team_nameLions team_namePackers team_namePanthers
## -3480.69 1114.11 1227.32
## team_namePatriots team_nameRaiders team_nameRams
## -214.20 -6324.74 -2884.85
## team_nameRavens team_nameRedskins team_nameSaints
## -398.90 6447.07 380.98
## team_nameSeahawks team_nameSteelers team_nameTexans
## -1405.89 -3567.81 264.07
## team_nameTitans team_nameVikings year
## -1118.23 -3183.08 74.73
## week margin_of_victory strength_of_schedule
## -72.83 137.58 230.74
## playoffsPlayoffs
## -427.94
# Model specification: random forest regression via the "ranger" engine.
rf_spec <- rand_forest() %>%
  set_mode("regression") %>%
  set_engine("ranger")
rf_spec
## Random Forest Model Specification (regression)
##
## Computational engine: ranger
# Fit the random forest with the same formula and training data as the
# linear model. (The ## lines are the recorded output.)
rf_fit <- fit(rf_spec, weekly_attendance ~ ., data = nfl_train)
rf_fit
## parsnip model object
##
## Fit time: 6.3s
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Regression
## Number of trees: 500
## Sample size: 7656
## Number of independent variables: 6
## Mtry: 2
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 74737868
## R squared (OOB): 0.08217606
# In-sample prediction:
# Predictions on the *training* data for both models, stacked long with a
# `model` identifier and the observed attendance as `truth`.
results_train <- bind_rows(
  lm_fit %>% # linear model
    predict(new_data = nfl_train) %>%
    mutate(truth = nfl_train$weekly_attendance, model = "lm"),
  rf_fit %>% # random forest
    predict(new_data = nfl_train) %>%
    mutate(truth = nfl_train$weekly_attendance, model = "rf")
)
results_train
# Out-of-sample prediction:
# Same stacked predictions, but on the held-out *test* data.
results_test <- bind_rows(
  lm_fit %>%
    predict(new_data = nfl_test) %>%
    mutate(truth = nfl_test$weekly_attendance, model = "lm"),
  rf_fit %>%
    predict(new_data = nfl_test) %>%
    mutate(truth = nfl_test$weekly_attendance, model = "rf")
)
results_test
# In-sample results:
# Training-set RMSE, one row per model.
results_train %>%
  group_by(model) %>%
  rmse(truth, .pred)
# RF >> LM: on the training data the random forest looks far better.
# Out-of-sample results:
# Test-set RMSE, one row per model.
results_test %>%
  group_by(model) %>%
  rmse(truth, .pred)
# LM ~ RF: on the test data the two models perform about equally —
# i.e. the RF has overfit the training data, but the LM has not.
# Let's visualise this:
# Truth vs predicted attendance, faceted into testing vs training panels;
# the dashed line marks perfect prediction.
bind_rows(
  results_test %>% mutate(train = "testing"),
  results_train %>% mutate(train = "training")
) %>%
  ggplot(aes(truth, .pred, color = model)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.5) +
  facet_wrap(~train) +
  labs(
    x = "Truth",
    y = "Predicted attendance",
    color = "Type of model"
  )
# Right panel (training): RF is much better. Left panel (testing): roughly equal.
# Estimate the random forest's performance honestly with cross-validation
# (vfold_cv defaults, stratified on playoffs) on the training set only,
# keeping the out-of-fold predictions for plotting.
set.seed(1234)
nfl_folds <- vfold_cv(nfl_train, strata = playoffs)
rf_res <- fit_resamples(
  rf_spec,
  weekly_attendance ~ .,
  resamples = nfl_folds,
  control = control_resamples(save_pred = TRUE)
)
collect_metrics(rf_res)
rf_res
# Out-of-fold predictions vs observed attendance, colored by fold.
# Use collect_predictions(), the supported tune accessor, instead of
# unnesting the internal `.predictions` list-column directly.
rf_res %>%
  collect_predictions() %>%
  ggplot(aes(weekly_attendance, .pred, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Truth",
    y = "Predicted game attendance",
    color = NULL
  )